Error-rate assessment

Take all the input files, attach their unique_id, find the ones with replicates, then look at the median of 2*SD (~95% spread around measurement mean) for all measurements to decided on an approriate significance threshold

file_vector = str_remove(list.files(path = "Input", pattern="*.csv", recursive = T), ".csv")

error_set = lapply(paste0("Input/", list.files(path = "Input", pattern="*.csv", recursive = T)), read_csv, skip = 5, col_names = F, col_select = 1:2, show_col_types=F)

for(i in 1:length(error_set)){
  error_set[[i]][,3] = file_vector[i]
}

error_set = do.call("rbind", error_set)

colnames(error_set) = c("Genotype", "Phenotype", "unique_id")

error_rates = error_set[which(duplicated(error_set %>% 
  unite(unique_geno, c(1,3), sep = "_", remove = F) %>%
  pull(unique_geno))),] %>%
  group_by(Genotype) %>%
  summarise(pheno_mean = mean(Phenotype),
            pheno_sd = sd(Phenotype))

## % of SDs for measurements that are less than 1.5-fold threshold

sum(error_rates$pheno_sd*2 <= log10(1.5)) / length(error_rates$pheno_sd)
## [1] 0.5170068
median(error_rates$pheno_sd*2)
## [1] 0.1705538
summary(error_rates$pheno_sd*2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.08511 0.17055 0.71405 0.74638 3.73452
error_rates %>%
  ggplot(aes(x = pheno_sd*2)) +
  geom_histogram(bins = 50) +
  geom_vline(xintercept = log10(1.5), lty = 2) +
  theme_classic() +
  theme(text = element_text(size=18), axis.text = element_text(size = 16, color = "black"),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none")

## Fold-Change Data

Here we look at the fold-change data used in the input files. This allows us to see positive and negative functional effects across the landscapes.

fit_land = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="observed_values.csv", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("id", "effect", "muts", "junk1", "junk2", "enz")))

## Make fit_land have IDs by finding where muts are 0, i.e. start of new dataset, and multiplying
## Those values by the folder names to generate ID vector

fit_land_spots = c(which(fit_land$muts == 0), (dim(fit_land)[1] + 1)) # Add end of dataset
fit_land_counts = c()

for(i in 1:(length(fit_land_spots) - 1)) {
  
  if(i == 1) {
    fit_land_counts = c(fit_land_counts, fit_land_spots[i + 1] - 1)
  } else {
    fit_land_counts = c(fit_land_counts, fit_land_spots[i + 1] - fit_land_spots[i])
  }

}

## This assumes folders are read in the order that they are listed

fit_land_ids = rep(str_remove_all(list.files(path = "Output", pattern="observed_values.csv", recursive = T), "/observed_values.csv"), fit_land_counts)

fit_land$unique_id = fit_land_ids

fit_land = fit_land %>% filter(muts != 0)

The data shows the following distribution

c(sum(fit_land$effect < log10(1/1.5)), sum(fit_land$effect >= log10(1/1.5) & fit_land$effect <= log10(1.5)), sum(fit_land$effect > log10(1.5)))
## [1] 360 292 807

And these percentages

c(sum(fit_land$effect < log10(1/1.5)), sum(fit_land$effect >= log10(1/1.5) & fit_land$effect <= log10(1.5)), sum(fit_land$effect > log10(1.5))) / sum(c(sum(fit_land$effect < log10(1/1.5)), sum(fit_land$effect >= log10(1/1.5) & fit_land$effect <= log10(1.5)), sum(fit_land$effect > log10(1.5))))
## [1] 0.2467443 0.2001371 0.5531186

Histogram

fit_land_plot = fit_land %>%
  ggplot(aes(x = effect)) +
  geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression(italic("F")), 
       y = "Genotype Counts") +
  scale_y_continuous(breaks = c(0, 50, 100, 150, 200, 250)) +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fit_land_plot

#ggsave("supp_fig_1.svg", plot = fit_land_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")

Ratio Data

First we import the ratio datasets, which contains ratio values between predicted function based on the additive model vs observed function. These ratio values serve as a general metric for “epistasis” as defined by the additive model.

all_ratios = lapply(paste0("Output/", list.files(path = "Output", pattern="ratio_export.csv", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("id", "effect", "muts", "cv", "colors"))

all_ratios_id = str_remove_all(list.files(path = "Output", pattern="ratio_export.csv", recursive = T), "/ratio_export.csv")

temp_col_length = length(all_ratios[[1]]) + 1

for(i in 1:length(all_ratios)) {
  all_ratios[[i]][, temp_col_length] = rep(all_ratios_id[i], dim(all_ratios[[i]])[1]) ## Add partial_id as a column to each ratio df in list
}

rm(temp_col_length)

## Make into a tibble
all_ratios = do.call("rbind", all_ratios)
colnames(all_ratios) = c("id", "effect", "muts", "cv", "colors", "partial_id")

Filtering ratio data to only include genotypes with 2+ mutations. Outputting length of vector.

all_ratios = all_ratios %>% filter(muts > 1)

length(all_ratios$effect) # examined muts
## [1] 1245

Number and % of synergystic genotypes (i.e. ratio > 1.5-fold)

sum(all_ratios$effect > 1.5) # synergystic
## [1] 502
paste0("Synergystic ", round(sum(all_ratios$effect > 1.5)/length(all_ratios$effect)*100,2), "%", collapse = "") # synergystic %
## [1] "Synergystic 40.32%"

Number and % of neutral genotypes (i.e. ratio > 1.5-fold)

sum(all_ratios$effect >= 1/1.5 & all_ratios$effect <= 1.5) # antagonistic
## [1] 379
paste0("Neutral ", round(sum(all_ratios$effect >= 1/1.5 & all_ratios$effect <= 1.5)/length(all_ratios$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 30.44%"

Number and % of antagonistic genotypes (i.e. ratio > 1.5-fold)

sum(all_ratios$effect < 1/1.5) # antagonistic
## [1] 364
paste0("Antagonistic ", round(sum(all_ratios$effect < 1/1.5)/length(all_ratios$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 29.24%"

Histogram

Shows distribution of positive and negative epistasis across the landscapes. The significance threshold is 1.5-fold

all_ratios_plot = all_ratios %>%
  ggplot(aes(x = log10(effect))) +
  geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
  theme_classic() +
  scale_fill_manual(values = c("grey")) +
  labs(x = expression("Observed" *italic(" F")*" / Predicted"*italic(" F")), 
       y = "Genotype Counts") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")


all_ratios_plot

#ggsave("supp_fig_2.svg", plot = all_ratios_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")

Also the ratios table. Here we create bins from -Inf to log10(1/1.5), i.e. -0.176, then the neutral range between log10(1/1.5) to log10(1.5), i.e. 0.176, then the positive range from 0.176 to Inf.

all_ratios %>%
  mutate(sign = cut(log10(effect), breaks = c(-Inf, log10(1/1.5), log10(1.5), Inf))) %>%
  count(sign) %>%
  mutate(percent = round(n/sum(n) * 100, 2))
## # A tibble: 3 × 3
##   sign               n percent
##   <fct>          <int>   <dbl>
## 1 (-Inf,-0.176]    364    29.2
## 2 (-0.176,0.176]   379    30.4
## 3 (0.176, Inf]     502    40.3

We can do the same thing for individual orders: 2nd, 3rd, and 4th order

2nd order

## 2nd step only

all_ratios_2 = all_ratios %>% filter(muts == 2)

length(all_ratios_2$effect) # examined muts
## [1] 419
sum(all_ratios_2$effect > 1.5) # synergystic
## [1] 135
paste0("Synergystic ", round(sum(all_ratios_2$effect > 1.5)/length(all_ratios_2$effect)*100, 2), "%", collapse = "") # synergystic %
## [1] "Synergystic 32.22%"
sum(all_ratios_2$effect >= 1/1.5 & all_ratios_2$effect <= 1.5) # antagonistic
## [1] 160
paste0("Neutral ", round(sum(all_ratios_2$effect >= 1/1.5 & all_ratios_2$effect <= 1.5)/length(all_ratios_2$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 38.19%"
sum(all_ratios_2$effect < 1/1.5) # antagonistic
## [1] 124
paste0("Antagonistic ", round(sum(all_ratios_2$effect < 1/1.5)/length(all_ratios_2$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 29.59%"

3rd order

## 3rd step only

all_ratios_3 = all_ratios %>% filter(muts == 3)

length(all_ratios_3$effect) # examined muts
## [1] 438
sum(all_ratios_3$effect > 1.5) # synergystic
## [1] 160
paste0("Synergystic ", round(sum(all_ratios_3$effect > 1.5)/length(all_ratios_3$effect)*100, 2), "%", collapse = "") # synergystic %
## [1] "Synergystic 36.53%"
sum(all_ratios_3$effect >= 1/1.5 & all_ratios_3$effect <= 1.5) # antagonistic
## [1] 120
paste0("Neutral ", round(sum(all_ratios_3$effect >= 1/1.5 & all_ratios_3$effect <= 1.5)/length(all_ratios_3$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 27.4%"
sum(all_ratios_3$effect < 1/1.5) # antagonistic
## [1] 158
paste0("Antagonistic ", round(sum(all_ratios_3$effect < 1/1.5)/length(all_ratios_3$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 36.07%"

4th order

## 4th step only

all_ratios_4 = all_ratios %>% filter(muts == 4)

length(all_ratios_4$effect) # examined muts
## [1] 267
sum(all_ratios_4$effect > 1.5) # synergystic
## [1] 127
paste0("Synergystic ", round(sum(all_ratios_4$effect > 1.5)/length(all_ratios_4$effect)*100, 2), "%", collapse = "") # synergystic %
## [1] "Synergystic 47.57%"
sum(all_ratios_4$effect >= 1/1.5 & all_ratios_4$effect <= 1.5) # antagonistic
## [1] 72
paste0("Neutral ", round(sum(all_ratios_4$effect >= 1/1.5 & all_ratios_4$effect <= 1.5)/length(all_ratios_4$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 26.97%"
sum(all_ratios_4$effect < 1/1.5) # antagonistic
## [1] 68
paste0("Antagonistic ", round(sum(all_ratios_4$effect < 1/1.5)/length(all_ratios_4$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 25.47%"

Positional Functional Contribution Data

Here we import the functional contribution of single positions (not combinations), which resembles first order analysis across all datasets.

## Reads all csvs in collated folder and combines them

d = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="*2d_box.csv", recursive = T)), read_csv, show_col_types=F, col_names = c("positions", "identity", "mut", "effects", "enzyme", "type", "cond")))


d1 = d %>% filter_all(any_vars(!is.na(.))) # removes rows with all NA before making unique ID

d1 = d1 %>%
  unite("unique_id", c(positions, enzyme, type, cond), remove = F)

d1 = d1 %>%
  unite("partial_id", c(enzyme, type, cond), remove = F)

Histogram

Spread of functional contributions of the positions. This shows whether introducing a mutation in a given position impacts the function positively or negatively (or neutral) across all backgrounds

d1_plot = d1 %>%
  ggplot(aes(x = effects)) +
  geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression(Delta*italic("F")), 
       y = "Genotype Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")


d1_plot

#ggsave("fig_1A.svg", plot = d1_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")

What are the percentages of each in the categories?

d1 %>% 
  count(effects < log10(1/1.5),
        effects > log10(1.5)) %>%
  mutate(prop = n/sum(n)*100)
## # A tibble: 3 × 4
##   `effects < log10(1/1.5)` `effects > log10(1.5)`     n  prop
##   <lgl>                    <lgl>                  <int> <dbl>
## 1 FALSE                    FALSE                   1447  35.6
## 2 FALSE                    TRUE                    1760  43.3
## 3 TRUE                     FALSE                    857  21.1

Density plots for each trajectory by enzymes

d1_plot_enz = d1 %>%
  ggplot(aes(x = effects, fill = enzyme)) +
  geom_density(alpha = 0.8, color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression(Delta*italic("F")), 
       y = "Genotype Count") +
  #scale_y_continuous(breaks = c(0, 50, 100, 150, 200, 250)) +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))


d1_plot_enz

### For WT-normalized data

Load the same kind of data as d1 except that has been normalized do the \(\Delta\)F in the WT background

wt_rel_df = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="*wt_rel.csv", recursive = T)), read_csv, show_col_types=F, col_names = c("positions", "identity", "mut", "effects", "enzyme", "type", "cond")))


wt_rel_df = wt_rel_df %>% filter_all(any_vars(!is.na(.))) # removes rows with all NA before making unique ID

wt_rel_df = wt_rel_df %>%
  unite("unique_id", c(positions, enzyme, type, cond), remove = F) %>%
  unite("partial_id", c(enzyme, type, cond), remove = F)

Then we plot it

wt_rel_plot = wt_rel_df %>%
  filter(mut > 1) %>% # filter out first mutations which are by definition 0 delta F
  ggplot(aes(x = effects)) +
  geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression(Delta*italic("F ")*"(Single Mutant Normalized)"), 
       y = "Genotype Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")


wt_rel_plot

#ggsave("fig_1A.svg", plot = d1_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")

And again for density plots for each enzyme

wt_rel_plot_enz = wt_rel_df %>%
  mutate(Enzyme = enzyme) %>%
  filter(mut > 1) %>%
  ggplot(aes(x = effects, fill = Enzyme)) +
  geom_density(alpha = 0.8, color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression(Delta*italic("F ")*"(Single Mutant Normalized)"), 
       y = "Density") +
  scale_x_continuous(limits = c(-2.5, 2.5)) + # Same data omitted for better resolution
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))


wt_rel_plot_enz
## Warning: Removed 73 rows containing non-finite values (`stat_density()`).

Given this spread which percentage is significantly (using log10 1.5-fold cutoff) negative, neutral, and positive in that order

wt_rel_df %>%
  filter(mut > 1) %>%
  mutate(sign = cut(effects, breaks = c(-Inf, log10(1/1.5), log10(1.5), Inf))) %>%
  count(sign) %>%
  mutate(percent = round(n/sum(n) * 100, 2))
## # A tibble: 3 × 3
##   sign               n percent
##   <fct>          <int>   <dbl>
## 1 (-Inf,-0.176]    942    24.5
## 2 (-0.176,0.176]  1438    37.4
## 3 (0.176, Inf]    1470    38.2

Heterogenity in Spread

Looks at 2*SD of each position/combination across all orders to determine functional heterogeneity.

This gives us a quick look at the general spread of heterogenity values and their mean.

idio_d = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="idio_df", recursive = T)), read_csv, show_col_types=F))

traj_col = sapply(str_split(list.files(path = "Output", pattern="idio_df", recursive = T), "/"),"[[",1)

traj_col_len = sapply(lapply(paste0("Output/", list.files(path = "Output", pattern="idio_df", recursive = T)), read_csv, show_col_types=F), dim)[1,]

traj_col_full = c()
for(i in 1:length(traj_col)) {
  traj_col_full = c(traj_col_full, rep(traj_col[i], traj_col_len[i]))
}

idio_d$traj = traj_col_full

Order 1

idio_d_order_1 = idio_d %>%
  filter(mutations == "Order 1") %>%
  ggplot(aes(x = idiosync)) +
  geom_histogram(binwidth = 0.08, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  labs(x = expression("2"*sigma), 
       y = "Position Count") +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

idio_d_order_1

#ggsave("fig_1B.svg", idio_d_order_1, width = 180/2, height = 247/4, dpi = 300, units = "mm")

First order positions which have a heterogeneity index > log10 1.5-fold, 2-fold, 5-fold, and 10-fold for adaptive trajectories

idio_d %>%
  filter(mutations == "Order 1") %>%
  mutate(idiosync_1.5 = idiosync > log10(1.5),
         idiosync_2 = idiosync > log10(2),
         idiosync_5 = idiosync > log10(5),
         idiosync_10 = idiosync > log10(10)) %>%
  group_by(mutations) %>%
  summarise(idiosync_sig_1.5 = sum(idiosync_1.5),
            idiosync_sig_2 = sum(idiosync_2),
            idiosync_sig_5 = sum(idiosync_5),
            idiosync_sig_10 = sum(idiosync_10),
            idiosync_total = length(idiosync),
            idiosync_1.5 = sum(idiosync_1.5)/length(idiosync_1.5) * 100,
            idiosync_2 = sum(idiosync_2)/length(idiosync_2) * 100,
            idiosync_5 = sum(idiosync_5)/length(idiosync_5) * 100,
            idiosync_10 = sum(idiosync_10)/length(idiosync_10) * 100
            ) %>%
  knitr::kable()
mutations idiosync_sig_1.5 idiosync_sig_2 idiosync_sig_5 idiosync_sig_10 idiosync_total idiosync_1.5 idiosync_2 idiosync_5 idiosync_10
Order 1 211 205 150 114 214 98.59813 95.79439 70.09346 53.27103

Order 2-4

Histogram to show heterogenity at Orders 2, 3, and 4 with the log10 1.5-fold significance threshold

Order 2

idio_d_plot_2 = idio_d %>%
  filter(mutations == "Order 2") %>%
  ggplot(aes(x = idiosync)) +
  geom_histogram(fill = "#edae49", color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  labs(x = expression("log"[10]*" 2"*sigma), 
       y = "Combination Count") +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

idio_d_plot_2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggsave("fig_2A_1.svg", idio_d_plot_2, width = 180/3, height = 247/4, dpi = 300, units = "mm")

Order 3

idio_d_plot_3 = idio_d %>%
  filter(mutations == "Order 3") %>%
  ggplot(aes(x = idiosync)) +
  geom_histogram(fill = "#edae49", color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  labs(x = expression("log"[10]*" 2"*sigma), 
       y = "Combination Count") +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

idio_d_plot_3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggsave("fig_2A_2.svg", idio_d_plot_3, width = 180/3, height = 247/4, dpi = 300, units = "mm")

Order 4

idio_d_plot_4 = idio_d %>%
  filter(mutations == "Order 4") %>%
  ggplot(aes(x = idiosync)) +
  geom_histogram(fill = "#edae49", color = "black", lwd = 0.1) +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  labs(x = expression("log"[10]*" 2"*sigma), 
       y = "Combination Count") +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

idio_d_plot_4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggsave("fig_2A_3.svg", idio_d_plot_4, width = 180/3, height = 247/4, dpi = 300, units = "mm")

Order 2-4 in Facet Grid

idio_d_multi_plot = idio_d %>%
  filter(mutations != "Order 1" & mutations != "Order 5" & mutations != "Order 6") %>%
  ggplot(aes(x = idiosync)) +
  geom_histogram(color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  labs(x = expression("2"*sigma), 
       y = "Combination Count") +
  facet_grid(~ mutations) +
  scale_x_continuous(limits = c(0,5)) +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

idio_d_multi_plot # Data removed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 51 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 6 rows containing missing values (`geom_bar()`).

#ggsave("fig_2A.svg", idio_d_multi_plot, width = 180, height = 247/4, dpi = 300, units = "mm")

What percentage of positions remain significantly heterogeneous (1.5-fold) at these orders?

idio_d %>%
  filter(mutations == "Order 2" | mutations == "Order 3" | mutations == "Order 4") %>%
  mutate(idiosync = idiosync > log10(1.5)) %>%
  group_by(mutations) %>%
  summarise(idiosync_sig = sum(idiosync),
            idiosync_total = length(idiosync),
            idiosync = sum(idiosync)/length(idiosync) * 100
            )
## # A tibble: 3 × 4
##   mutations idiosync_sig idiosync_total idiosync
##   <chr>            <int>          <int>    <dbl>
## 1 Order 2            413            419     98.6
## 2 Order 3            420            438     95.9
## 3 Order 4            240            245     98.0

What percentage of positions remain significantly heterogeneous (1.5-, 2-, 5-, and 10-fold) at all orders?

idio_d %>%
  filter(mutations == "Order 1" | mutations == "Order 2" | mutations == "Order 3" | mutations == "Order 4") %>%
  mutate(idiosync_1.5 = idiosync > log10(1.5),
         idiosync_2 = idiosync > log10(2),
         idiosync_5 = idiosync > log10(5),
         idiosync_10 = idiosync > log10(10)) %>%
  group_by(mutations) %>%
  summarise(idiosync_sig_1.5 = sum(idiosync_1.5),
            idiosync_sig_2 = sum(idiosync_2),
            idiosync_sig_5 = sum(idiosync_5),
            idiosync_sig_10 = sum(idiosync_10),
            idiosync_total = length(idiosync),
            idiosync_1.5 = sum(idiosync_1.5)/length(idiosync_1.5) * 100,
            idiosync_2 = sum(idiosync_2)/length(idiosync_2) * 100,
            idiosync_5 = sum(idiosync_5)/length(idiosync_5) * 100,
            idiosync_10 = sum(idiosync_10)/length(idiosync_10) * 100
            ) %>%
  knitr::kable()
mutations idiosync_sig_1.5 idiosync_sig_2 idiosync_sig_5 idiosync_sig_10 idiosync_total idiosync_1.5 idiosync_2 idiosync_5 idiosync_10
Order 1 211 205 150 114 214 98.59813 95.79439 70.09346 53.27103
Order 2 413 387 253 172 419 98.56802 92.36277 60.38186 41.05012
Order 3 420 374 199 136 438 95.89041 85.38813 45.43379 31.05023
Order 4 240 204 122 61 245 97.95918 83.26531 49.79592 24.89796

Heterogenity in Sign

Collects all \(\Delta\)Function data for all orders

higher_order_list = list.files(path = "Output", pattern="higher_box", recursive = T)

higher_df = data.frame()
for(higher_file in 1:length(higher_order_list)){
  Condition = str_split(str_split(higher_order_list[higher_file], "_")[[1]][3], "/")[[1]][1]
  Measurement = str_split(higher_order_list[higher_file], "_")[[1]][2]
  Enzyme = str_split(higher_order_list[higher_file], "_")[[1]][1]
  appending_file = read_csv(paste0("Output/", higher_order_list[higher_file]), show_col_types=F)
  appending_file$Condition = Condition
  appending_file$Measurement = Measurement
  appending_file$Enzyme = Enzyme
  higher_df = bind_rows(higher_df, appending_file)
}


higher_df = higher_df %>%
  unite("unique_id", c(pos, Condition, Measurement, Enzyme), remove = F) %>%
  unite("partial_id", c(Enzyme, Measurement, Condition), remove = F)

higher_all_sign_check = higher_df %>%
  filter(str_count(genotype, "x")/str_length(genotype) != 1) %>% # Remove interactions with no other backgrounds
  mutate(mutations = factor(paste("Order", mutations, sep = " "))) %>%
  group_by(unique_id, mutations) %>%
  summarise(min_effect = min(avg),
            max_effect = max(avg))

Assigning positive, neutral, negative, negative-neutral, positive-neutral, and negative-positive ‘types’ based on log10 1.5-fold threshold to every Order

threshold = 1.5

types = c()
for(each in 1:dim(higher_all_sign_check)[1]){
  if(higher_all_sign_check$min_effect[each] < log10(1/threshold)){
    # Negative branch
    if(higher_all_sign_check$max_effect[each] < log10(1/threshold)) {
      # Full negative
      types = c(types, "Negative")
    } else if(higher_all_sign_check$max_effect[each] <= log10(threshold)) {
      # Negative Neutral
      types = c(types, "Neutral Negative")
    } else {
      # Negative Positive
      types = c(types, "Positive Negative")
    }
  } else {
    # Neutral or Positive
    if(higher_all_sign_check$min_effect[each] > log10(threshold)) {
      # Full positive
      types = c(types, "Positive")
    } else if(higher_all_sign_check$max_effect[each] > log10(threshold)) {
      # Neutral Positive
      types = c(types, "Neutral Positive")
    } else {
      # Neutral
      types = c(types, "Neutral")
    }
  }
}

higher_all_sign_check$type = factor(types)

All outputs show position/combination numbers in each category, followed by the total sum of all probed positions/combinations at that order, and finally percentages of each category

Order 1

order_checker = higher_all_sign_check %>% 
  filter(mutations == "Order 1") %>%
  pull(type) 

order_checker_levels = c("Negative", "Neutral", "Positive", "Neutral Negative", "Neutral Positive", "Positive Negative")

summary(order_checker) 
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##                 4                 2                17                35 
##          Positive Positive Negative 
##                21               135
sum(summary(order_checker)) 
## [1] 214
summary(order_checker) / length(order_checker) * 100
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##         1.8691589         0.9345794         7.9439252        16.3551402 
##          Positive Positive Negative 
##         9.8130841        63.0841121
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))

pie_order_1 = ggplot(pie_df, aes(x="", y=values, fill=names))+ 
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y") +
  theme_minimal() + 
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))

pie_order_1

#ggsave("fig_1C.svg", pie_order_1, width = 180/2, height = 247/4, dpi = 300, units = "mm")

Order 2

order_checker = higher_all_sign_check %>% 
  filter(mutations == "Order 2") %>%
  pull(type) 

summary(order_checker) 
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##                 5                 5                37                63 
##          Positive Positive Negative 
##                13               296
sum(summary(order_checker)) 
## [1] 419
summary(order_checker) / length(order_checker) * 100
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##          1.193317          1.193317          8.830549         15.035800 
##          Positive Positive Negative 
##          3.102625         70.644391
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))

pie_order_2 = ggplot(pie_df, aes(x="", y=values, fill=names))+ 
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y") +
  theme_minimal() + 
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))

pie_order_2

#ggsave("fig_2B_1.svg", pie_order_2, width = 180/2, height = 247/4, dpi = 300, units = "mm")

Order 3

order_checker = higher_all_sign_check %>% 
  filter(mutations == "Order 3") %>%
  pull(type) 

summary(order_checker) 
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##                15                10                63                59 
##          Positive Positive Negative 
##                51               240
sum(summary(order_checker)) 
## [1] 438
summary(order_checker) / length(order_checker) * 100
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##          3.424658          2.283105         14.383562         13.470320 
##          Positive Positive Negative 
##         11.643836         54.794521
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))

pie_order_3 = ggplot(pie_df, aes(x="", y=values, fill=names))+ 
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y") +
  theme_minimal() + 
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))

pie_order_3

#ggsave("fig_2B_2.svg", pie_order_3, width = 180/2, height = 247/4, dpi = 300, units = "mm")

Order 4

order_checker = higher_all_sign_check %>% 
  filter(mutations == "Order 4") %>%
  pull(type) 

summary(order_checker) 
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##                11                 5                50                42 
##          Positive Positive Negative 
##                11               126
sum(summary(order_checker)) 
## [1] 245
summary(order_checker) / length(order_checker) * 100
##          Negative           Neutral  Neutral Negative  Neutral Positive 
##          4.489796          2.040816         20.408163         17.142857 
##          Positive Positive Negative 
##          4.489796         51.428571
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))

pie_order_4 = ggplot(pie_df, aes(x="", y=values, fill=names))+ 
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y") +
  theme_minimal() + 
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))

pie_order_4

#ggsave("fig_2B_3.svg", pie_order_4, width = 180/2, height = 247/4, dpi = 300, units = "mm")

All Orders

Raw Values

higher_all_sign_check_table = higher_all_sign_check %>% 
  group_by(mutations) %>%
  count(type) %>%
  pivot_wider(names_from = type,
              values_from = n) %>%
  mutate(Total = sum(across("Negative":"Positive Negative"))) 

higher_all_sign_check_table %>% knitr::kable()
mutations Negative Neutral Neutral Negative Neutral Positive Positive Positive Negative Total
Order 1 4 2 17 35 21 135 214
Order 2 5 5 37 63 13 296 419
Order 3 15 10 63 59 51 240 438
Order 4 11 5 50 42 11 126 245
Order 5 5 4 12 18 6 39 84
Order 6 1 2 2 2 1 6 14

Percentages

higher_all_sign_check_table %>%
  mutate(Negative = Negative/Total*100,
         Neutral = Neutral/Total*100,
         `Neutral Negative` = `Neutral Negative`/Total*100,
         `Neutral Positive` = `Neutral Positive`/Total*100,
         Positive = Positive/Total*100,
         `Positive Negative` = `Positive Negative`/Total*100) %>%
  knitr::kable()
mutations Negative Neutral Neutral Negative Neutral Positive Positive Positive Negative Total
Order 1 1.869159 0.9345794 7.943925 16.35514 9.813084 63.08411 214
Order 2 1.193317 1.1933174 8.830549 15.03580 3.102625 70.64439 419
Order 3 3.424657 2.2831050 14.383562 13.47032 11.643836 54.79452 438
Order 4 4.489796 2.0408163 20.408163 17.14286 4.489796 51.42857 245
Order 5 5.952381 4.7619048 14.285714 21.42857 7.142857 46.42857 84
Order 6 7.142857 14.2857143 14.285714 14.28571 7.142857 42.85714 14

WT points to Mean Deviation

Purpose of this analysis was to determine the difference in \(\Delta\)Function between the WT-background contribution of a given position/combination and the mean contribution of a given position or combination

Histogram

The spread of all positional contribution vs mean differences for each Order (1-4)

higher_df_mean = higher_df %>%
  filter(mutations < 5) %>%
  filter(str_count(genotype, "x")/str_length(genotype) != 1) %>% # Remove interactions with no other backgrounds
  group_by(unique_id, partial_id) %>%
  summarise(avg = mean(avg))

devs = c()
sign_devs = c()
cur_mutations = c()
for(i in 1:dim(higher_df_mean)[1]) {
  curr_id = higher_df_mean$unique_id[i]
  devs = c(devs, abs( (higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(avg)) - (higher_df_mean$avg[i]) ))
  cur_mutations = c(cur_mutations, (higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(mutations)))
  sign_devs = c(sign_devs, ifelse( (((higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(avg)) > log10(1.5) & (higher_df_mean$avg[i]) < log10(1/1.5)) | ((higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(avg)) < log10(1/1.5) & (higher_df_mean$avg[i]) > log10(1.5))), T, F)) ## Check if WT is positive while average is negative or vice versa
}


devs_df_wt = data.frame(devs = devs, muts = cur_mutations, sign_devs = sign_devs)

## Add unique ID if the WT deviates from mean

devs_df_wt = devs_df_wt %>%
  mutate(unique_id = higher_df_mean$unique_id[row_number()]) %>%
  mutate(partial_id = higher_df_mean$partial_id[row_number()])

WT sign heterogenity for Order 1

How many WT points have sign deviation from the mean at the 1st order?

paste0(length(which(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs)))
## [1] "7/214"
paste0(round(sum(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "3.27%"

WT sign heterogenity for Order 2

How many WT points have sign deviation from the mean at the 2nd order?

paste0(length(which(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs)))
## [1] "53/419"
paste0(round(sum(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "12.65%"

WT sign heterogenity for Order 3

How many WT points have sign deviation from the mean at the 3rd order?

paste0(length(which(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs)))
## [1] "34/438"
paste0(round(sum(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "7.76%"

WT sign heterogenity for Order 4

How many WT points have sign deviation from the mean at the 4th order?

paste0(length(which(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs)))
## [1] "12/245"
paste0(round(sum(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "4.9%"

Next we move away from sign to normal distance heterogenity

WT heterogenity for Order 1

devs_df_wt_1_plot = devs_df_wt %>%
  filter(muts == 1) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
       y = "Position Count") +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

devs_df_wt_1_plot

#ggsave("fig_1D.svg", devs_df_wt_1_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")

WT heterogenity for Order 2

devs_df_wt_2 = devs_df_wt %>%
  filter(muts == 2) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  theme_classic() +
  #scale_x_continuous(limits = c(0, 3)) +
  labs(x = expression("Deviation from Combinatorial Mean (log"[10]*" "*epsilon*")"),
       y = "Combination Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

devs_df_wt_2

#ggsave("fig_sup_4B_1.svg", devs_df_wt_2, width = 180/3, height = 247/4, dpi = 300, units = "mm")

WT heterogenity for Order 3

devs_df_wt_3 = devs_df_wt %>%
  filter(muts == 3) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  #scale_x_continuous(limits = c(0, 3)) +
  theme_classic() +
  labs(x = expression("Deviation from Combinatorial Mean (log"[10]*" "*epsilon*")"),
       y = "Combination Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

devs_df_wt_3

#ggsave("fig_sup_4B_2.svg", devs_df_wt_3, width = 180/3, height = 247/4, dpi = 300, units = "mm")

WT heterogenity for Order 4

devs_df_wt_4 = devs_df_wt %>%
  filter(muts == 4) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  theme_classic() +
  #scale_x_continuous(limits = c(0, 3)) + # Same scale used later
  labs(x = expression("Deviation from Combinatorial Mean (log"[10]*" "*epsilon*")"),
       y = "Combination Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

devs_df_wt_4

#ggsave("fig_sup_4B_3.svg", devs_df_wt_4, width = 180/3, height = 247/4, dpi = 300, units = "mm")

WT heterogenity for Order 2-4 in facet grid

devs_df_wt_multi = devs_df_wt %>%
  filter(muts == 2 | muts == 3 | muts == 4) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.1, color = "black", lwd = 0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  facet_grid(~ muts) +
  scale_x_continuous(limits = c(0, 3)) +
  theme_classic() +
  labs(x = expression("Deviation from Combinatorial Mean ("*epsilon*")"),
       y = "Combination Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

devs_df_wt_multi
## Warning: Removed 8 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 6 rows containing missing values (`geom_bar()`).

#ggsave("fig_2C.svg", devs_df_wt_multi, width = 180, height = 247/4, dpi = 300, units = "mm")

Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by significance threshold (log10 1.5-fold)

devs_df_wt %>%
  mutate(muts = factor(muts)) %>%
  group_by(muts) %>%
  summarise(percent = sum(devs > log10(1.5)) / length(devs),
            outlier = sum(devs > log10(1.5)),
            total = length(devs)) %>%
  knitr::kable()
muts percent outlier total
1 0.6775701 145 214
2 0.6443914 270 419
3 0.5593607 245 438
4 0.6244898 153 245

Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by multiple significance threshold (log10 1.5-, 2-, 5-, and 10-fold)

devs_df_wt %>%
  mutate(Order = factor(muts)) %>%
  group_by(Order) %>%
  summarise(percent_1.5 = round(sum(devs > log10(1.5)) / length(devs) *100, 1),
            outlier_1.5 = sum(devs > log10(1.5)),
            percent_2 = round(sum(devs > log10(2)) / length(devs) *100, 1),
            outlier_2 = sum(devs > log10(2)),
            percent_5 = round(sum(devs > log10(5)) / length(devs) *100, 1),
            outlier_5 = sum(devs > log10(5)),
            percent_10 = round(sum(devs > log10(10)) / length(devs) *100, 1), 
            outlier_10 = sum(devs > log10(10)),
            total = length(devs)) %>%
  knitr::kable()
Order percent_1.5 outlier_1.5 percent_2 outlier_2 percent_5 outlier_5 percent_10 outlier_10 total
1 67.8 145 51.4 110 19.2 41 9.3 20 214
2 64.4 270 51.3 215 21.5 90 9.8 41 419
3 55.9 245 35.2 154 19.2 84 13.9 61 438
4 62.4 153 33.9 83 9.0 22 6.9 17 245

All points to Mean Deviation

Purpose of this analysis was to determine the difference in \(\Delta\)Function between functional contribution of a given position/combination in any background and the mean contribution of a given position or combination

Histogram

The spread of all positional contribution vs mean differences for each Order (1-4)

devs = c()
cur_mutations = c()
cur_partial_id = c()
for(i in 1:dim(higher_df_mean)[1]) {
  curr_id = higher_df_mean$unique_id[i]
  devs = c(devs, abs( (higher_df %>% filter(unique_id == curr_id) %>% pull(avg)) - (higher_df_mean$avg[i] ) ))
  cur_mutations = c(cur_mutations, (higher_df %>% filter(unique_id == curr_id) %>% pull(mutations)))
  cur_partial_id = c(cur_partial_id, (higher_df %>% filter(unique_id == curr_id) %>% pull(partial_id)))
}

devs_df = data.frame(devs = devs, muts = cur_mutations, partial_id = cur_partial_id)

All heterogenity for Order 1

devs_df %>%
  filter(muts == 1) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
       y = "Genotype Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

All heterogenity for Order 2

devs_df %>%
  filter(muts == 2) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
       y = "Genotype Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

All heterogenity for Order 3

devs_df %>%
  filter(muts == 3) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
       y = "Genotype Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

All heterogenity for Order 4

devs_df %>%
  filter(muts == 4) %>%
  ggplot(aes(x = devs)) +
  geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
  geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
  geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
  theme_classic() +
  labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
       y = "Genotype Count") +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")

Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by significance threshold (log10 1.5-fold)

devs_df %>%
  mutate(Order = factor(muts)) %>%
  group_by(Order) %>%
  summarise(percent = sum(devs > log10(1.5)) / length(devs),
            outlier = sum(devs > log10(1.5)),
            total = length(devs)) %>%
  knitr::kable()
Order percent outlier total
1 0.5888287 2393 4064
2 0.5286815 2470 4672
3 0.4963038 1477 2976
4 0.5598214 627 1120

Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by many significance thresholds (log10 1.5-, 2-, 5-, and 10-fold)

devs_df %>%
  mutate(Order = factor(muts)) %>%
  group_by(Order) %>%
  summarise(percent_1.5 = sum(devs > log10(1.5)) / length(devs),
            outlier_1.5 = sum(devs > log10(1.5)),
            percent_2 = sum(devs > log10(2)) / length(devs),
            outlier_2 = sum(devs > log10(2)),
            percent_5 = sum(devs > log10(5)) / length(devs),
            outlier_5 = sum(devs > log10(5)),
            percent_10 = sum(devs > log10(10)) / length(devs),
            outlier_10 = sum(devs > log10(10)),
            total = length(devs)) %>%
  knitr::kable()
Order percent_1.5 outlier_1.5 percent_2 outlier_2 percent_5 outlier_5 percent_10 outlier_10 total
1 0.5888287 2393 0.4040354 1642 0.1483760 603 0.0809547 329 4064
2 0.5286815 2470 0.3488870 1630 0.1316353 615 0.0747003 349 4672
3 0.4963038 1477 0.3114919 927 0.1202957 358 0.0796371 237 2976
4 0.5598214 627 0.3455357 387 0.0991071 111 0.0589286 66 1120

Prediction

Import the absolute error files for each data set

all_mae = lapply(paste0("Output/", list.files(path = "Output", pattern="aic_*", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("MAE", "Last MAE", "Step", "Enzyme", "Linear MAE", "Linear Last MAE"))

for(element in 1:length(all_mae)){
  all_mae[[element]]$ID = str_split(list.files(path = "Output", pattern="aic_*", recursive = T)[element], "/")[[1]][1]
}

all_mae = do.call("rbind", all_mae)

For some reason the absolute error files for OXA-48 CAZ and PIP Traj 1 are including the step 4 model, which uses all 4 of the coefficients in the model for prediction, which is results in a perfect prediction for the WT-background model. I can just remove it manually here

all_mae = all_mae %>% filter(!((ID == "OXA-48_ic50_CAZtraj1" | ID == "OXA-48_ic50_PIPtraj1") & Step == 4))

Some statistics for the AE for each model. Now F is converted to fold-change in function (but remains WT-normalized)

all_mae %>% 
  group_by(Step) %>% 
  summarise(wtbg_mean = 10^mean(`Last MAE`),
            wtbg_median = 10^median(`Last MAE`),
            lin_mean = 10^mean(`Linear Last MAE`),
            lin_median = 10^median(`Linear Last MAE`)) %>%
  knitr::kable()
Step wtbg_mean wtbg_median lin_mean lin_median
1 19.507536 8.008615 2.889882 2.302982
2 111.737428 63.740988 2.016922 1.765125
3 51.067872 8.952979 1.273370 1.192176
4 28.215338 4.499601 1.105085 1.066184
5 7.322787 1.313971 1.094891 1.080794
6 1.730950 1.000000 1.114261 1.106314
7 1.000000 1.000000 1.087920 1.087920

Are the means significantly less than 1.5-fold?

all_mae %>% 
  group_by(Step) %>% 
  summarise(wtbg_pval = t.test(`Last MAE`, mu = log10(1.5), alternative = "less")$p.value,
            lin_mean = t.test(`Linear Last MAE`, mu = log10(1.5), alternative = "less")$p.value) %>%
  filter(Step < 5)
## # A tibble: 4 × 3
##    Step wtbg_pval lin_mean
##   <dbl>     <dbl>    <dbl>
## 1     1     1.00  1.00e+ 0
## 2     2     1.00  9.99e- 1
## 3     3     1.00  1.33e- 6
## 4     4     0.994 2.43e-12

Only the linear model gets there for the 3rd order onwards.

After loading and assigning ID corresponding to each landscape value (should be 45) we can plot linear and WT-model MAE’s with their constituents

all_mae_long = all_mae %>%
  pivot_longer(cols = c('Last MAE', 'Linear Last MAE'), names_to = "Model", values_to = "AE") %>%
    filter(Step < 5)
  
mae_ggplot = all_mae_long %>%
  ggplot(aes(x = Step, y = AE, color = Model)) +
  geom_point(size = 1, position=position_jitterdodge(jitter.width = 0.1)) +
  stat_summary(data = all_mae_long %>% filter(Model == "Last MAE"), fun=mean, colour="black", geom="crossbar", width=0.2, position = position_nudge(x = -0.19, y = 0), size = 0.3) +
  stat_summary(data = all_mae_long %>% filter(Model == "Linear Last MAE"), fun=mean, colour="black", geom="crossbar", width=0.2, position = position_nudge(x = 0.19, y = 0),size = 0.3) +
   stat_summary(data = all_mae_long %>% filter(Model == "Last MAE"), fun=median, colour="darkred", geom="crossbar", width=0.2, position = position_nudge(x = -0.19, y = 0), size = 0.3) +
  stat_summary(data = all_mae_long %>% filter(Model == "Linear Last MAE"), fun=median, colour="darkred", geom="crossbar", width=0.2, position = position_nudge(x = 0.19, y = 0),size = 0.3) +
  geom_hline(yintercept = log10(1.5), lty = 2, size = 0.2) +
  scale_color_manual(values = c("#edae49", "#4988ed")) +
  labs(x = "Order of the Model", 
       y = expression("Absolute Error of Predicted"*italic(" F"))) +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none") +
  coord_cartesian(ylim=c(0, 6)) # Zoom without affecting means and medians
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
mae_ggplot

#ggsave("fig_3B.svg", mae_ggplot, width = 180/2, height = 247/4, dpi = 300, units = "mm")

This plot omits some data from being viewed, however plotted means and medians are accurate. The figure is edited outside of the code for technical

What is the distribution of predicted vs non-predicted at each order, not just the means?

all_mae_long %>%
  group_by(Step, Model) %>%
  summarise(pred= length(AE[AE < log10(1.5)]), 
            non_pred= length(AE[AE >= log10(1.5)])) %>%
  mutate(pred_ratio = round(pred/(pred+non_pred)*100,2)) %>%
  knitr::kable()
## `summarise()` has grouped output by 'Step'. You can override using the
## `.groups` argument.
Step Model pred non_pred pred_ratio
1 Last MAE 6 39 13.33
1 Linear Last MAE 13 32 28.89
2 Last MAE 5 40 11.11
2 Linear Last MAE 15 30 33.33
3 Last MAE 5 40 11.11
3 Linear Last MAE 34 11 75.56
4 Last MAE 2 21 8.70
4 Linear Last MAE 22 1 95.65

Trajectory

Seeing how adaptive trajectories are affected by epistasis

First I defined the adaptive trajectories. These are hardcoded based on the dataset I am working with.

adaptive_trajectories = c("DHFR_ki_trajg", 
                          "DHFR_ki_trajr",
                          "MPH_catact_ZnPTM",
                          "NfsA_ec50_2039",
                          "NfsA_ec50_3637",
                          "OXA-48_ic50_CAZtraj1",
                          "OXA-48_ic50_CAZtraj2",
                          "OXA-48_ic50_CAZtraj3",
                          "PTE_catact_2NH",
                          "TEM_MIC_weinreich",
                          "DHFR_ic75_palmer")

Then I need to create a global trajectory data frame

all_traj = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="traj_epi_*", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("genotype", "avg", "pos", "mutations", "likely", "enzyme_name", "measure_type", "cond")))

all_traj = all_traj %>% unite("unique_id", c(enzyme_name, measure_type, cond), remove = F)

Next I need to filter landscapes by whether they are adaptive or not

adaptive_traj = all_traj %>% filter(unique_id %in% adaptive_trajectories)

Which genotypes in the adaptive landscapes for the most accessible trajectory, as defined by the boolean variable likely, are POSITIVELY epistatic and epistatic in general?

Note: some trajectories don’t go to the last variant, so we explore epistasis of all other genotypes which may go beyond last variant

adaptive_traj_favored = adaptive_traj %>% filter(mutations > 1 & likely) %>% pull(avg)

sum(adaptive_traj_favored > log10(1.5)) / length(adaptive_traj_favored)
## [1] 0.3809524
sum(adaptive_traj_favored > log10(1.5) | adaptive_traj_favored < log10(1/1.5)) / length(adaptive_traj_favored)
## [1] 0.6428571

How many positive epistatic vs all epistatic genotypes vs all genotypes encountered in these landscapes?

sum(adaptive_traj_favored > log10(1.5))
## [1] 16
sum(adaptive_traj_favored > log10(1.5) | adaptive_traj_favored < log10(1/1.5))
## [1] 27
length(adaptive_traj_favored)
## [1] 42

Which genotypes in the non-adaptive landscapes for the favored trajectory are NEGATIVELY epistatic and epistatic in general? Also total genotypes that are not most accessible, with percentages of negative epistasis and epistasis in general.

`%notin%` <- Negate(`%in%`)

non_adaptive_traj_favored = all_traj %>% filter(unique_id %notin% adaptive_trajectories & mutations > 1 & !likely) %>% pull(avg)

sum(non_adaptive_traj_favored < log10(1/1.5))
## [1] 204
sum(non_adaptive_traj_favored > log10(1.5) | non_adaptive_traj_favored < log10(1/1.5))
## [1] 408
length(non_adaptive_traj_favored)
## [1] 557
sum(non_adaptive_traj_favored < log10(1/1.5)) / length(non_adaptive_traj_favored)
## [1] 0.3662478
sum(non_adaptive_traj_favored > log10(1.5) | non_adaptive_traj_favored < log10(1/1.5)) / length(non_adaptive_traj_favored)
## [1] 0.7324955

Prediction of relevant trajectories

We construct thee plotting_d tibble which has information about the observed effect, linear model effect which is simply ccalled epistatic effect and the effect of each order of the WT-background model

adaptive_traj_fave = adaptive_traj %>% filter(likely)

###

plotting_d = tibble()
dummy2 = data.frame()

for(i in 1:length(adaptive_trajectories)) {
  
  suffix = tail(str_split(adaptive_trajectories[i], "_")[[1]], 1)
  
  file = paste0("pred_df_", suffix, "_", adaptive_trajectories[i], ".csv")
  
  d = read_csv(paste0("Output/", adaptive_trajectories[i], "/", file), show_col_types=F)
  
  genos = str_replace_all(adaptive_traj_fave[which(adaptive_traj_fave$unique_id %in% adaptive_trajectories[i]),]$genotype, "x", "1")
  
  genos = c(str_replace_all(genos[1], "1", "0"), genos)
  
  dummy2 = rbind(dummy2, data.frame(trajectory = adaptive_trajectories[i], Z = d[which(d$numbers == tail(genos, 1)),]$`observed effect`))
  
  others = c()
  for(j in 1:(length(genos) - 2)) {
    others = c(others, colnames(d)[which(colnames(d) == "mutations") + j]) ## Get all steps before final step of evaluated genotype
  }
  
  plotting_d = rbind(plotting_d, d[which(d$numbers %in% genos),] %>%
    pivot_longer(cols = c(`observed effect`,
                          `epistatic effect`,
                          `WT effect`,
                          others)) %>% ## add those steps into line plots
    select(c(mutations, name, value)) %>%
    mutate(trajectory = adaptive_trajectories[i]))

}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(others)
## 
##   # Now:
##   data %>% select(all_of(others))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Next we plot all of those

ggplot(plotting_d, aes(x = mutations, y = value, color = name)) +
  geom_line(lwd = 2) +
  facet_wrap(~ trajectory) +
  geom_hline(data = dummy2, aes(yintercept = Z)) +
  geom_hline(data = dummy2, aes(yintercept = Z + log10(1.5)), lty = 2) +
  geom_hline(data = dummy2, aes(yintercept = Z - log10(1.5)), lty = 2) +
  geom_hline(data = dummy2, aes(yintercept = Z + log10(10)), lty = 2, color = "grey") +
  geom_hline(data = dummy2, aes(yintercept = Z - log10(10)), lty = 2, color = "grey") +
  theme_classic()

Also return as a table

plotting_d %>%
  pivot_wider(names_from = name, values_from = value) %>%
  knitr::kable()
mutations trajectory observed effect epistatic effect WT effect 2 step 3 step 4 step final step 5 step 6 step
0 DHFR_ki_trajg 0.0000000 -0.0124266 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 NA NA
1 DHFR_ki_trajg 1.1303338 1.1421598 1.1303338 1.1303338 1.1303338 1.1303338 1.1303338 NA NA
2 DHFR_ki_trajg 2.3598355 2.3480971 1.7504698 2.3598355 2.3598355 2.3598355 2.3598355 NA NA
3 DHFR_ki_trajg 3.3541084 3.3664861 2.0928925 3.3782593 3.3541084 3.3541084 3.3541084 NA NA
4 DHFR_ki_trajg 4.5820634 4.5691010 2.4392455 6.4837389 4.2208458 4.5820634 4.5820634 NA NA
5 DHFR_ki_trajg 5.9656720 5.9782807 2.7914280 9.0023947 3.9479878 6.3680742 5.9656720 NA NA
0 DHFR_ki_trajr 0.0000000 -0.0155046 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 NA NA
1 DHFR_ki_trajr 1.1303338 1.1452378 1.1303338 1.1303338 1.1303338 1.1303338 1.1303338 NA NA
2 DHFR_ki_trajr 2.3598355 2.3450191 1.7504698 2.3598355 2.3598355 2.3598355 2.3598355 NA NA
3 DHFR_ki_trajr 3.3541084 3.3695641 2.0928925 3.3782593 3.3541084 3.3541084 3.3541084 NA NA
4 DHFR_ki_trajr 4.2810334 4.2664918 2.4450750 5.1111507 4.4545256 4.2810334 4.2810334 NA NA
5 DHFR_ki_trajr 5.2304489 5.2453065 2.8049105 7.8594519 4.0646531 5.7290199 5.2304489 NA NA
0 MPH_catact_ZnPTM 0.0000000 -0.0016066 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 NA NA
1 MPH_catact_ZnPTM 0.9943172 0.9914272 0.9943172 0.9943172 0.9943172 0.9943172 0.9943172 NA NA
2 MPH_catact_ZnPTM 1.2068259 1.2012842 1.2661588 1.2068259 1.2068259 1.2068259 1.2068259 NA NA
3 MPH_catact_ZnPTM 1.2405492 1.2353742 1.9345447 1.2404054 1.2405492 1.2405492 1.2405492 NA NA
4 MPH_catact_ZnPTM 2.3117539 2.3103613 2.8654937 0.4136588 2.0807635 2.3117539 2.3117539 NA NA
5 MPH_catact_ZnPTM 2.9609462 2.9605514 2.0776813 -0.4173805 2.1149224 3.6596868 2.9609462 NA NA
0 NfsA_ec50_2039 0.0000000 -0.0052604 0.0000000 0.0000000 0.0000000 0.0000000 NA 0.0000000 NA
1 NfsA_ec50_2039 0.4216039 0.4174507 0.4216039 0.4216039 0.4216039 0.4216039 NA 0.4216039 NA
2 NfsA_ec50_2039 0.6190933 0.6108143 0.4785088 0.6190933 0.6190933 0.6190933 NA 0.6190933 NA
3 NfsA_ec50_2039 0.7450748 0.7388571 0.4244695 0.7362056 0.7450748 0.7450748 NA 0.7450748 NA
4 NfsA_ec50_2039 0.8488047 0.8477300 0.4121357 0.7618568 0.7740661 0.8488047 NA 0.8488047 NA
5 NfsA_ec50_2039 0.9845273 0.9797227 0.3801515 -0.1849312 1.2563700 1.2151156 NA 0.9845273 NA
0 NfsA_ec50_3637 0.0000000 -0.0052604 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1 NfsA_ec50_3637 0.4800069 0.4741952 0.4800069 0.4800069 0.4800069 0.4800069 0.4800069 0.4800069 0.4800069
2 NfsA_ec50_3637 0.6711728 0.6693158 0.5369118 0.6711728 0.6711728 0.6711728 0.6711728 0.6711728 0.6711728
3 NfsA_ec50_3637 0.8413595 0.8383239 0.5245781 0.6711856 0.8413595 0.8413595 0.8413595 0.8413595 0.8413595
4 NfsA_ec50_3637 0.9180303 0.9137495 0.4462645 0.4576674 1.4626969 0.9180303 0.9180303 0.9180303 0.9180303
5 NfsA_ec50_3637 0.9566486 0.9519493 0.1443651 0.8355329 1.8320479 0.0426191 0.9566486 0.9566486 0.9566486
6 NfsA_ec50_3637 0.9454686 0.9436716 0.0727609 0.6151170 2.4772843 -0.4757923 0.9454686 1.6847688 0.9454686
7 NfsA_ec50_3637 1.0043214 1.0028427 0.0939502 1.2003309 1.7113959 -1.2265483 1.0043214 4.5060330 -0.9087341
0 OXA-48_ic50_CAZtraj1 0.0000000 -0.0026570 0.0000000 0.0000000 0.0000000 NA 0.0000000 NA NA
1 OXA-48_ic50_CAZtraj1 0.3560259 0.3488146 0.3560259 0.3560259 0.3560259 NA 0.3560259 NA NA
2 OXA-48_ic50_CAZtraj1 1.1398791 1.1223656 0.3360292 1.1398791 1.1398791 NA 1.1398791 NA NA
3 OXA-48_ic50_CAZtraj1 1.4842998 1.4833406 0.4115762 2.0345134 1.4842998 NA 1.4842998 NA NA
4 OXA-48_ic50_CAZtraj1 1.6042261 1.5994638 0.5255195 1.9259530 1.3318026 NA 1.6042261 NA NA
0 OXA-48_ic50_CAZtraj2 0.0000000 -0.0026570 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 NA
1 OXA-48_ic50_CAZtraj2 0.3856063 0.3839493 0.3856063 0.3856063 0.3856063 0.3856063 0.3856063 0.3856063 NA
2 OXA-48_ic50_CAZtraj2 0.8027737 0.7969589 0.7694216 0.8027737 0.8027737 0.8027737 0.8027737 0.8027737 NA
3 OXA-48_ic50_CAZtraj2 1.0934217 1.0916060 0.9093007 1.4373797 1.0934217 1.0934217 1.0934217 1.0934217 NA
4 OXA-48_ic50_CAZtraj2 1.5276299 1.5256020 0.9005268 2.1488613 1.1459227 1.5276299 1.5276299 1.5276299 NA
5 OXA-48_ic50_CAZtraj2 1.5682017 1.5672424 0.8881931 2.2688061 1.6984605 1.6998067 1.5682017 1.5682017 NA
6 OXA-48_ic50_CAZtraj2 1.5078559 1.5046215 0.8967932 1.9530868 0.7115869 2.2277266 1.5078559 1.2706843 NA
0 OXA-48_ic50_CAZtraj3 0.0000000 -0.0026570 0.0000000 0.0000000 0.0000000 0.0000000 NA NA NA
1 OXA-48_ic50_CAZtraj3 0.4329693 0.4292205 0.4329693 0.4329693 0.4329693 0.4329693 NA NA NA
2 OXA-48_ic50_CAZtraj3 0.9938769 0.9737267 0.4898741 0.9938769 0.9938769 0.9938769 NA NA NA
3 OXA-48_ic50_CAZtraj3 1.3364597 1.3340956 0.4941955 1.1216466 1.3364597 1.3364597 NA NA NA
4 OXA-48_ic50_CAZtraj3 1.4149733 1.4081219 0.5355882 0.9184096 1.6022940 1.4149733 NA NA NA
0 PTE_catact_2NH 0.0000000 -0.0030856 0.0000000 0.0000000 0.0000000 0.0000000 NA NA NA
1 PTE_catact_2NH 1.4199557 1.4175792 1.4199557 1.4199557 1.4199557 1.4199557 NA NA NA
2 PTE_catact_2NH 2.2095150 2.2072984 2.2408137 2.2095150 2.2095150 2.2095150 NA NA NA
3 PTE_catact_2NH 3.0211893 3.0194333 2.8546556 4.0243530 3.0211893 3.0211893 NA NA NA
4 PTE_catact_2NH 3.2405492 3.2383995 2.6608355 4.1859390 2.3618565 3.2405492 NA NA NA
0 TEM_MIC_weinreich 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 NA NA
1 TEM_MIC_weinreich 1.2013971 1.2016454 1.2013971 1.2013971 1.2013971 1.2013971 1.2013971 NA NA
2 TEM_MIC_weinreich 3.6117233 3.6118198 1.3803741 3.6117233 3.6117233 3.6117233 3.6117233 NA NA
3 TEM_MIC_weinreich 4.3783979 4.3777366 1.3803741 5.8493869 4.3783979 4.3783979 4.3783979 NA NA
4 TEM_MIC_weinreich 4.5185139 4.5179153 1.2893949 8.6829825 2.5918364 4.5185139 4.5185139 NA NA
5 TEM_MIC_weinreich 4.6683859 4.6683012 1.2893949 8.9067386 3.4005435 4.3989125 4.6683859 NA NA
0 DHFR_ic75_palmer 0.0000000 -0.0177247 0.0000000 0.0000000 0.0000000 NA NA NA NA
1 DHFR_ic75_palmer 1.8555192 1.8548586 1.8555192 1.8555192 1.8555192 NA NA NA NA
2 DHFR_ic75_palmer 2.1931246 2.1911919 2.1630152 2.1931246 2.1931246 NA NA NA NA
3 DHFR_ic75_palmer 2.3424227 2.3418586 2.8064679 2.4667779 2.3424227 NA NA NA NA

Then we make a figure tracking error (visual of supplementary table 6)

plotting_d_supp_4 = plotting_d %>%
  mutate(name = str_replace(name, "WT effect", "1 step")) %>%
  filter(name != "epistatic effect") %>%
  filter(name != "final step") %>%
  pivot_wider(names_from = "name",
              values_from = "value") %>%
  pivot_longer(cols = `1 step`:`6 step`) %>%
  drop_na() %>%
  mutate(value = abs(value - `observed effect`)) %>%
  filter(mutations > as.numeric(str_sub(name, 1, 1)))

supp_4_plot = plotting_d_supp_4 %>%
  mutate(`Model Order` = str_replace(name, " step", "th Order")) %>%
  ggplot(aes(x = mutations, y = value, color = `Model Order`)) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 0 + log10(1.5), lty = 2) +
  geom_point() +
  geom_line(lwd = 0.5) +
  facet_wrap(~ trajectory) +
  #geom_hline(data = dummy2, aes(yintercept = Z - log10(1.5)), lty = 2) +
  #geom_hline(data = dummy2, aes(yintercept = Z + log10(10)), lty = 2, color = "grey") +
  #geom_hline(data = dummy2, aes(yintercept = Z - log10(10)), lty = 2, color = "grey") +
  theme_classic() +
  labs(x = "Predicted mutant order", 
       y = expression("Absolute error of predicted"*italic(" F"))) +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))

#ggsave("fig_3D.svg", supp_4_plot, width = 180, height = 247/3, dpi = 300, units = "mm")

Ruggedness plots

I can make a ruggedness plot for each adaptive landscape, where deviation from 0 implies epistatic contribution of that genotype

adaptive_traj %>% 
  filter(mutations > 1 & likely) %>%
  ggplot(aes(x = mutations, y = avg)) +
  geom_line() +
  geom_text_repel(aes(label = genotype), force = 1.5, direction = "y", box.padding = 0, segment.size = 0.2, size = 2.5) +
  geom_hline(yintercept = log10(1.5), lty = 2) +
  geom_hline(yintercept = log10(1/1.5), lty = 2) +
  geom_hline(yintercept = 0) +
  facet_wrap(~ unique_id) +
  theme_classic()

What if we compare it to the mean at that combination or position as well, not just assumed additivity (0 +/- log10(1.5))?

higher_means = higher_df %>%
  unite("partial_id", c(Enzyme, Measurement, Condition), remove = F) %>%
  filter(partial_id %in% adaptive_trajectories) %>% ## use partial ID to only pick up the adaptive trajectories
  group_by(unique_id) %>%
  mutate(avg = mean(avg)) %>% ## Is arithmetic mean OK here?
  ungroup() %>%
  distinct(unique_id, .keep_all = T) ## Remove unique_id replicates

adaptive_means = higher_means %>% arrange(partial_id, pos) %>% pull(avg)

adaptive_traj = adaptive_traj %>% arrange(unique_id, pos) %>% mutate(means = adaptive_means)

Then we make the plot

trajectory_labeller = function(variable,value){
  return(trajectory_names[value])
}

trajectory_names = list(
  "DHFR_ic75_palmer" = expression("DHFR Palmer"*italic(" et al.")),
  "DHFR_ki_trajg" = expression("DHFR Traj. G"),
  "DHFR_ki_trajr" = expression("DHFR Traj. R"),
  "MPH_catact_ZnPTM" = expression("MPH Traj. Zn"),
  "NfsA_ec50_2039" = expression("NfsA Traj. 20_39"),
  "NfsA_ec50_3637" = expression("NfsA Traj. 36_37"),
  "OXA-48_ic50_CAZtraj1" = expression("OXA-48 CAZ Traj. 1"),
  "OXA-48_ic50_CAZtraj2" = expression("OXA-48 CAZ Traj. 2"),
  "OXA-48_ic50_CAZtraj3" = expression("OXA-48 CAZ Traj. 3"),
  "PTE_catact_2NH" = expression("PTE Traj. 2NH"),
  "TEM_MIC_weinreich" = expression("TEM Weinreich"*italic(" et al."))
)

adaptive_traj_plot = adaptive_traj %>%
  filter(likely & mutations > 1) %>%
  mutate(wt_bg = avg) %>%
  pivot_longer(c(wt_bg, means)) %>%
  ggplot(aes(x = factor(mutations), y = value, fill = name)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_hline(yintercept = log10(1.5), lty = 2, size = 0.2) +
  geom_hline(yintercept = log10(1/1.5), lty = 2, size = 0.2) +
  geom_hline(yintercept = 0) +
    scale_color_manual(values = c("#edae49", "#4988ed")) +
  facet_wrap(~ unique_id, 
             scales = "free",
             labeller = trajectory_labeller) +
  labs(x = "Mutational Step", 
       y = expression(epsilon)) +
  theme_classic() +
  theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
        legend.position = "none", axis.line=element_line()) +
  scale_fill_manual(values = c("#4988ed", "#edae49")) +
  scale_y_continuous(limits=c(-3,3))
## Warning: The `labeller` API has been updated. Labellers taking `variable` and `value`
## arguments are now deprecated.
## ℹ See labellers documentation.
adaptive_traj_plot

#ggsave("fig_4.svg", adaptive_traj_plot, width = 180, height = 247/2, dpi = 300, units = "mm")

Which one of the genotypes seen above are the outliers?

adaptive_traj %>%
  filter(likely & mutations > 1) %>%
  mutate(outlier = abs(means - avg) > log10(1.5)) %>%
  select(c(genotype, unique_id)) %>%
  knitr::kable()
genotype unique_id
x0xx00 DHFR_ic75_palmer
00xx00 DHFR_ic75_palmer
xxxxx DHFR_ki_trajg
0xxxx DHFR_ki_trajg
0xx0x DHFR_ki_trajg
00x0x DHFR_ki_trajg
xxxxx DHFR_ki_trajr
xxx0x DHFR_ki_trajr
0xx0x DHFR_ki_trajr
00x0x DHFR_ki_trajr
0xx00 MPH_catact_ZnPTM
0xx0x MPH_catact_ZnPTM
xxxxx MPH_catact_ZnPTM
xxx0x MPH_catact_ZnPTM
xxx00xx NfsA_ec50_2039
x0x00xx NfsA_ec50_2039
x0000x0 NfsA_ec50_2039
x0000xx NfsA_ec50_2039
xxxxxxx NfsA_ec50_3637
xxx0xxx NfsA_ec50_3637
x0x0xxx NfsA_ec50_3637
x0x00x0 NfsA_ec50_3637
x0x00xx NfsA_ec50_3637
x0000x0 NfsA_ec50_3637
xxxx OXA-48_ic50_CAZtraj1
0xxx OXA-48_ic50_CAZtraj1
0x0x OXA-48_ic50_CAZtraj1
00x0x0 OXA-48_ic50_CAZtraj2
00x0xx OXA-48_ic50_CAZtraj2
xxxxxx OXA-48_ic50_CAZtraj2
0xxxxx OXA-48_ic50_CAZtraj2
0xx0xx OXA-48_ic50_CAZtraj2
xx0x00 OXA-48_ic50_CAZtraj3
xx0x0x OXA-48_ic50_CAZtraj3
x00x00 OXA-48_ic50_CAZtraj3
xxx0x0 PTE_catact_2NH
xx00x0 PTE_catact_2NH
0x00x0 PTE_catact_2NH
00x0x TEM_MIC_weinreich
xxxxx TEM_MIC_weinreich
0xxxx TEM_MIC_weinreich
0xx0x TEM_MIC_weinreich

Now we swich gears to look at the entire adaptive landscapes, not just the most accessible trajectory.

What % of adaptive landscape genotypes are outliers from the mean? What % of adaptive accessible trajectory genotypes are outliers from the mean? What % of adaptive less accessible trajectory genotypes are outliers from the mean?

adaptive_traj_final = adaptive_traj %>%
  mutate(outlier = abs(means - avg) > log10(1.5))

length(adaptive_traj_final$outlier)
## [1] 645
sum(adaptive_traj_final$outlier)
## [1] 309
sum(adaptive_traj_final$outlier) / length(adaptive_traj_final$outlier)
## [1] 0.4790698
length(filter(adaptive_traj_final, likely) %>% pull(outlier))
## [1] 53
sum(filter(adaptive_traj_final, likely) %>% pull(outlier))
## [1] 30
sum(filter(adaptive_traj_final, likely) %>% pull(outlier)) / length(filter(adaptive_traj_final, likely) %>% pull(outlier))
## [1] 0.5660377
length(filter(adaptive_traj_final, !likely) %>% pull(outlier))
## [1] 592
sum(filter(adaptive_traj_final, !likely) %>% pull(outlier))
## [1] 279
sum(filter(adaptive_traj_final, !likely) %>% pull(outlier)) / length(filter(adaptive_traj_final, !likely) %>% pull(outlier))
## [1] 0.4712838

Diminishing Returns

Next we try to see if adaptive vs random trajectories show diminishing returns.

Doing for single mutants first for every single landscape…

higher_df_diminish = higher_df %>%
  unite("partial_id", c(Enzyme, Measurement, Condition)) %>%
  filter(mutations == 1) %>%
  mutate(from = str_replace_all(genotype, "x", "0"),
         to = str_replace_all(genotype, "x", "1"))

start_effect = c()
for(i in 1:dim(higher_df_diminish)[1]) {
  
  if(!str_detect(higher_df_diminish$from[i], "1")) {
    start_effect = c(start_effect, 0) # If starts from WT, start effect is 0
  } else {
    start_effect = c(start_effect, filter(fit_land, unique_id == higher_df_diminish$partial_id[i] & id == higher_df_diminish$from[i]) 
  %>% pull(effect))
  }
}

higher_df_diminish$start_effect = start_effect

ggplot(higher_df_diminish, aes(x = start_effect, y = avg)) +
  geom_point() +
  theme_classic()

Next only looking at adaptive landscapes

higher_df_diminish = higher_df %>%
  unite("partial_id", c(Enzyme, Measurement, Condition)) %>%
  filter(mutations == 1) %>%
  mutate(from = str_replace_all(genotype, "x", "0"),
         to = str_replace_all(genotype, "x", "1")) %>%
  filter(partial_id %in% adaptive_trajectories)

start_effect = c()
for(i in 1:dim(higher_df_diminish)[1]) {
  
  if(!str_detect(higher_df_diminish$from[i], "1")) {
    start_effect = c(start_effect, 0) # If starts from WT, start effect is 0
  } else {
    start_effect = c(start_effect, filter(fit_land, unique_id == higher_df_diminish$partial_id[i] & id == higher_df_diminish$from[i]) 
  %>% pull(effect))
  }
}

higher_df_diminish$start_effect = start_effect

ggplot(higher_df_diminish, aes(x = start_effect, y = avg)) +
  geom_point() +
  facet_wrap(~ partial_id) +
  geom_smooth(method = 'lm', formula = y ~ x) +
  stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~~")), color = "red", geom = "label", label.x = -2.5, label.y = 4, r.accuracy = 0.01, p.accuracy = 0.01) +
  theme_classic() 
## Warning: The dot-dot notation (`..rr.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(rr.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

What about only looking at the best step from each starting genotype?

higher_df_diminish %>%
  group_by(partial_id, from) %>%
  summarise(avg = max(avg), start_effect = start_effect) %>%
  distinct() %>%
  ggplot(aes(x = start_effect, y = avg)) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x) +
  stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~~")), color = "red", geom = "label", label.x = -2, label.y = 4.7, r.accuracy = 0.01, p.accuracy = 0.01) +
  facet_wrap(~ partial_id) +
  theme_classic()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'partial_id', 'from'. You can override
## using the `.groups` argument.

What about only looking at the adaptive trajectories?

all_traj_points = all_traj %>% 
  filter(likely & unique_id %in% adaptive_trajectories) %>%
  mutate(genotype = str_replace_all(genotype, "x", "1")) %>%
  select(genotype, unique_id)


higher_df_diminish_traj = tibble()
prev_geno = c()
for(i in 1:dim(all_traj_points)[1]) {
 
  if(length(prev_geno) > 0) {
    higher_df_diminish_traj = rbind(higher_df_diminish_traj, higher_df_diminish %>% filter(partial_id == all_traj_points$unique_id[i] & to == all_traj_points$genotype[i]) %>% filter(from == prev_geno)) 
  } else {
    higher_df_diminish_traj = rbind(higher_df_diminish_traj, higher_df_diminish %>% filter(partial_id == all_traj_points$unique_id[i] & to == all_traj_points$genotype[i]))
  }
    
  if(i != dim(all_traj_points)[1]) { 
    if(all_traj_points$unique_id[i + 1] == all_traj_points$unique_id[i]) {
      prev_geno = tail(higher_df_diminish_traj$to, 1)
    } else {
      prev_geno = c()
    }
  }

}

higher_df_diminish_traj %>%
  ggplot(aes(x = start_effect, y = avg)) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x) +
  stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~~")), color = "red", geom = "label", label.x = 0, label.y = 3.7, r.accuracy = 0.01, p.accuracy = 0.01) +
  facet_wrap(~ partial_id) +
  theme_classic()

We can also do a side by side correlation comparison of starting effect vs MAXIMUM delFs as well as vs ALL POSSIBLE delFs

max_list = higher_df_diminish %>%
  group_by(partial_id, from) %>%
  summarise(avg = max(avg), start_effect = start_effect) %>%
  distinct()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'partial_id', 'from'. You can override
## using the `.groups` argument.
maxes = c()
for(entry in 1:dim(higher_df_diminish)[1]) {
  maxes = c(maxes, ifelse(any(higher_df_diminish[entry,]$from == max_list$from & higher_df_diminish[entry,]$partial_id == max_list$partial_id & higher_df_diminish[entry,]$avg == max_list$avg & higher_df_diminish[entry,]$start_effect == max_list$start_effect), T, F))
}

higher_df_diminish$is_max = maxes

higher_df_diminish_plot = higher_df_diminish %>%
  ggplot(aes(x = start_effect, y = avg)) +
  geom_point(aes(fill = is_max), shape = 21, color = "black", stroke = 0.1) +
  geom_smooth(method = 'lm', formula = y ~ x, color = "grey") +
  geom_smooth(data = higher_df_diminish %>% filter(is_max), method = 'lm', formula = y ~ x, color = "#edae49") +
  stat_cor(aes(label = paste(..rr.label.., sep = "~~")), label.x = 2, label.y = 5.5, r.accuracy = 0.01, size = 2.5, color = "grey") +
  stat_cor(data = higher_df_diminish %>% filter(is_max), aes(label = paste(..rr.label.., sep = "~~")), label.x = 2, label.y = 4, r.accuracy = 0.01, size = 2.5, color = "#edae49") +
  facet_wrap(~ partial_id, scales = "free") +
  labs(x = expression("Starting Function ("*italic("F")*")"), 
       y = expression(Delta*'F')) +
  theme_classic() +
  theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
        legend.position = "none", axis.line=element_line()) +
  scale_fill_manual(values = c("grey", "#edae49")) +
  scale_x_continuous(limits=c(-6,6)) + 
  scale_y_continuous(limits=c(-6,6))

higher_df_diminish_plot

#ggsave("supp_fig_3.svg", higher_df_diminish_plot, width = 180, height = 247/2, dpi = 300, units = "mm")

Other Exploratory Work

Modify adaptive traj to also have the genotypic effect from fit_land. To do this I need to take the unique_id from adaptive traj and the genotype after x -> 1 modification from adaptive traj, and find the corresponding value of the genotype in fit_land

traj_effects = c()
for(i in 1:dim(adaptive_traj)[1]){
  
  traj_effects= c(traj_effects, fit_land %>%
    filter(unique_id == adaptive_traj$unique_id[i]) %>%
    filter(id == str_replace_all(adaptive_traj$genotype[i], "x", "1")) %>%
    pull(effect))
  
}

adaptive_traj$traj_effects = traj_effects

Now that adaptive_traj has genotype effects, we can ask the question: Did the idiosyncratic epistasis at that order specifically enable or restrict that genotype.

access = c()
access_no_idio = c()
access_mean = c()

for(i in 1:dim(adaptive_traj)[1]) {
  
  current_test = adaptive_traj[i,]
  
  if(current_test$mutations != 1) {
    loc = str_locate_all(current_test$genotype, "x")[[1]][,1]
    len = str_length(current_test$genotype)
    
    func = c()
    
    # Get all previous constituents
    for(j in 1:length(loc)) {
      const = current_test$genotype
      substr(const, loc[j], loc[j]) = "0"
      
      func = c(func, adaptive_traj %>% filter(unique_id == current_test$unique_id & genotype == const) %>% pull(traj_effects))
    }
  
    access = c(access, sum(current_test$traj_effects - func >= 0))
    access_no_idio = c(access_no_idio, sum((current_test$traj_effects - current_test$avg) - func >= 0)) # Access without epistasis
    access_mean = c(access_mean, sum((current_test$traj_effects - (current_test$avg - current_test$means)) - func >= 0))
  } else {
    access = c(access, NA)
    access_no_idio = c(access_no_idio, NA)
    access_mean = c(access_mean, NA)
  }
}

Now let’s look at the accessibility metrics in a tibble

access_df = tibble(access = access,
                   access_no_idio = access_no_idio,
                   access_mean = access_mean)

access_df = access_df %>% mutate(access_w_mean = access_mean - access,
                     access_w_idio = access - access_no_idio)

adaptive_traj_access = cbind(adaptive_traj, access_df)

head(adaptive_traj_access)
##   genotype         avg                    pos mutations likely        unique_id
## 1   x00000  0.64345268                     p1         1  FALSE DHFR_ic75_palmer
## 2   xx0000  0.01200891                 p1|p21         2  FALSE DHFR_ic75_palmer
## 3   xxx000  4.09150055             p1|p21|p26         3  FALSE DHFR_ic75_palmer
## 4   xxxx00 -3.44007396         p1|p21|p26|p28         4  FALSE DHFR_ic75_palmer
## 5   xxxxx0  0.21273191     p1|p21|p26|p28|p30         5  FALSE DHFR_ic75_palmer
## 6   xxxxxx -7.18913559 p1|p21|p26|p28|p30|p94         6  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond       means traj_effects access
## 1        DHFR         ic75 palmer  1.20623549    0.6434527     NA
## 2        DHFR         ic75 palmer  0.82391077    1.3838154      2
## 3        DHFR         ic75 palmer  0.21721618    1.9106244      3
## 4        DHFR         ic75 palmer  0.08693999    2.3242825      3
## 5        DHFR         ic75 palmer -3.38183588    2.1038037      0
## 6        DHFR         ic75 palmer -7.18913559    2.1583625      2
##   access_no_idio access_mean access_w_mean access_w_idio
## 1             NA          NA            NA            NA
## 2              2           2             0             0
## 3              1           1            -2             2
## 4              4           4             1            -1
## 5              0           0             0             0
## 6              6           2             0            -4

What does accessibility look like in adaptive trajectories only?

head(adaptive_traj_access %>%
  filter(likely))
##   genotype         avg                 pos mutations likely        unique_id
## 1   x0xx00 -0.12435525          p1|p26|p28         3   TRUE DHFR_ic75_palmer
## 2   00xx00  0.03010940             p26|p28         2   TRUE DHFR_ic75_palmer
## 3   000x00  1.85551916                 p28         1   TRUE DHFR_ic75_palmer
## 4    xxxxx -0.40240226 p21|p26|p28|p30|p94         5   TRUE    DHFR_ki_trajg
## 5    0xxxx  0.36121760     p26|p28|p30|p94         4   TRUE    DHFR_ki_trajg
## 6    0xx0x -0.02415083         p26|p28|p94         3   TRUE    DHFR_ki_trajg
##   enzyme_name measure_type   cond       means traj_effects access
## 1        DHFR         ic75 palmer -0.37628762     2.342423      3
## 2        DHFR         ic75 palmer  0.51316879     2.193125      2
## 3        DHFR         ic75 palmer  0.02486007     1.855519     NA
## 4        DHFR           ki  trajg -0.40240226     5.965672      5
## 5        DHFR           ki  trajg  0.16001647     4.582063      4
## 6        DHFR           ki  trajg -0.03088869     3.354108      3
##   access_no_idio access_mean access_w_mean access_w_idio
## 1              3           2            -1             0
## 2              2           2             0             0
## 3             NA          NA            NA            NA
## 4              5           5             0             0
## 5              4           4             0             0
## 6              3           3             0             0

And again for non-adaptive trajectories

head(adaptive_traj_access %>%
  filter(!likely))
##   genotype         avg                    pos mutations likely        unique_id
## 1   x00000  0.64345268                     p1         1  FALSE DHFR_ic75_palmer
## 2   xx0000  0.01200891                 p1|p21         2  FALSE DHFR_ic75_palmer
## 3   xxx000  4.09150055             p1|p21|p26         3  FALSE DHFR_ic75_palmer
## 4   xxxx00 -3.44007396         p1|p21|p26|p28         4  FALSE DHFR_ic75_palmer
## 5   xxxxx0  0.21273191     p1|p21|p26|p28|p30         5  FALSE DHFR_ic75_palmer
## 6   xxxxxx -7.18913559 p1|p21|p26|p28|p30|p94         6  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond       means traj_effects access
## 1        DHFR         ic75 palmer  1.20623549    0.6434527     NA
## 2        DHFR         ic75 palmer  0.82391077    1.3838154      2
## 3        DHFR         ic75 palmer  0.21721618    1.9106244      3
## 4        DHFR         ic75 palmer  0.08693999    2.3242825      3
## 5        DHFR         ic75 palmer -3.38183588    2.1038037      0
## 6        DHFR         ic75 palmer -7.18913559    2.1583625      2
##   access_no_idio access_mean access_w_mean access_w_idio
## 1             NA          NA            NA            NA
## 2              2           2             0             0
## 3              1           1            -2             2
## 4              4           4             1            -1
## 5              0           0             0             0
## 6              6           2             0            -4

Now I can ask a variety of questions with this dataset. For example, lets see where epistasis changes node visitation

dim(adaptive_traj_access %>%
  drop_na() %>%
  filter(access != access_no_idio))[1] / dim(adaptive_traj_access %>% drop_na())[1]
## [1] 0.4939966
paste0(dim(adaptive_traj_access %>%
  drop_na() %>%
  filter(access != access_no_idio))[1], "/", dim(adaptive_traj_access %>% drop_na())[1])
## [1] "288/583"

How often does epistasis changes node visitation for most likely trajectories

dim(adaptive_traj_access %>%
  filter(likely) %>%
  drop_na() %>%
  filter(access != access_no_idio))[1] / dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1]
## [1] 0.2619048
paste0(dim(adaptive_traj_access %>%
  filter(likely) %>%
  drop_na() %>%
  filter(access != access_no_idio))[1], "/", dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1])
## [1] "11/42"

Where is idiosyncratic visitation different from the mean WHEN epistasis affects node visitation?

dim(adaptive_traj_access %>%
  drop_na() %>%
  filter(access != access_mean & access != access_no_idio))[1] / dim(adaptive_traj_access %>% filter(access != access_no_idio) %>% drop_na())[1]
## [1] 0.7326389
paste0(dim(adaptive_traj_access %>%
  drop_na() %>%
  filter(access != access_mean & access != access_no_idio))[1], "/", dim(adaptive_traj_access %>% filter(access != access_no_idio) %>% drop_na())[1])
## [1] "211/288"

Where is idiosyncratic visitation different from the mean in the likely traj?

dim(adaptive_traj_access %>%
  filter(likely) %>%
  drop_na() %>%
  filter(access != access_mean))[1] / dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1]
## [1] 0.1904762
paste0(dim(adaptive_traj_access %>%
  filter(likely) %>%
  drop_na() %>%
  filter(access != access_mean))[1], "/", dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1])
## [1] "8/42"

Restrictive idiosyncrasy: When negative epistasis prevents node visitation AND mean epistasis would have allowed for it

head(adaptive_traj_access %>%
  drop_na() %>%
  filter(avg < log10(1/1.5) & access_w_idio < 0 & access_w_mean > 0 & means > 0))
##   genotype        avg            pos mutations likely        unique_id
## 1   xxxx00 -3.4400740 p1|p21|p26|p28         4  FALSE DHFR_ic75_palmer
## 2   x0xx0x -1.1042305 p1|p26|p28|p94         4  FALSE DHFR_ic75_palmer
## 3   0xx000 -4.0673669        p21|p26         2  FALSE DHFR_ic75_palmer
## 4   0x0xx0 -1.6714200    p21|p28|p30         3  FALSE DHFR_ic75_palmer
## 5   0x00x0 -0.2888077        p21|p30         2  FALSE DHFR_ic75_palmer
## 6   0x00xx -1.5696224    p21|p30|p94         3  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond      means traj_effects access access_no_idio
## 1        DHFR         ic75 palmer 0.08693999    2.3242825      3              4
## 2        DHFR         ic75 palmer 2.57320556    2.3263359      3              4
## 3        DHFR         ic75 palmer 0.53515465   -3.0315171      0              2
## 4        DHFR         ic75 palmer 0.76025803   -0.8041003      0              1
## 5        DHFR         ic75 palmer 0.29888333    0.7067178      1              2
## 6        DHFR         ic75 palmer 0.27945161   -0.5436340      0              2
##   access_mean access_w_mean access_w_idio
## 1           4             1            -1
## 2           4             1            -1
## 3           2             2            -2
## 4           2             2            -1
## 5           2             1            -1
## 6           2             2            -2

What about positive epistasis that does not affect or increases visitation?

head(adaptive_traj_access %>%
  drop_na() %>%
  filter(avg > log10(1.5) & access_w_idio >= 0))
##   genotype        avg                pos mutations likely        unique_id
## 1   xxx000  4.0915006         p1|p21|p26         3  FALSE DHFR_ic75_palmer
## 2   xxxxx0  0.2127319 p1|p21|p26|p28|p30         5  FALSE DHFR_ic75_palmer
## 3   xxxx0x 10.4358638 p1|p21|p26|p28|p94         5  FALSE DHFR_ic75_palmer
## 4   xxx0xx  2.5755328 p1|p21|p26|p30|p94         5  FALSE DHFR_ic75_palmer
## 5   xx0x00  0.7938784         p1|p21|p28         3  FALSE DHFR_ic75_palmer
## 6   xx0xx0  1.9958051     p1|p21|p28|p30         4  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond       means traj_effects access
## 1        DHFR         ic75 palmer  0.21721618     1.910624      3
## 2        DHFR         ic75 palmer -3.38183588     2.103804      0
## 3        DHFR         ic75 palmer  6.84129599     2.294466      3
## 4        DHFR         ic75 palmer -1.01903501     2.313867      5
## 5        DHFR         ic75 palmer  0.01045432     2.071882      3
## 6        DHFR         ic75 palmer  1.86938029     2.184691      3
##   access_no_idio access_mean access_w_mean access_w_idio
## 1              1           1            -2             2
## 2              0           0             0             0
## 3              0           1            -2             3
## 4              0           0            -5             5
## 5              1           1            -2             2
## 6              1           2            -1             2
head(adaptive_traj_access %>%
  drop_na() %>%
  filter(avg > log10(1.5) & access_w_idio > 0))
##   genotype        avg                pos mutations likely        unique_id
## 1   xxx000  4.0915006         p1|p21|p26         3  FALSE DHFR_ic75_palmer
## 2   xxxx0x 10.4358638 p1|p21|p26|p28|p94         5  FALSE DHFR_ic75_palmer
## 3   xxx0xx  2.5755328 p1|p21|p26|p30|p94         5  FALSE DHFR_ic75_palmer
## 4   xx0x00  0.7938784         p1|p21|p28         3  FALSE DHFR_ic75_palmer
## 5   xx0xx0  1.9958051     p1|p21|p28|p30         4  FALSE DHFR_ic75_palmer
## 6   xx0xxx  3.1289862 p1|p21|p28|p30|p94         5  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond       means traj_effects access
## 1        DHFR         ic75 palmer  0.21721618     1.910624      3
## 2        DHFR         ic75 palmer  6.84129599     2.294466      3
## 3        DHFR         ic75 palmer -1.01903501     2.313867      5
## 4        DHFR         ic75 palmer  0.01045432     2.071882      3
## 5        DHFR         ic75 palmer  1.86938029     2.184691      3
## 6        DHFR         ic75 palmer -0.46558155     2.278754      4
##   access_no_idio access_mean access_w_mean access_w_idio
## 1              1           1            -2             2
## 2              0           1            -2             3
## 3              0           0            -5             5
## 4              1           1            -2             2
## 5              1           2            -1             2
## 6              2           2            -2             2

Permissive idiosyncrasy: When positive epistasis enables node visitation AND mean epistasis would have prevented it

head(adaptive_traj_access %>%
  drop_na() %>%
  filter(avg > log10(1.5) & access_w_idio > 0 & access_w_mean < 0 & means < 0))
##   genotype       avg                pos mutations likely        unique_id
## 1   xxx0xx 2.5755328 p1|p21|p26|p30|p94         5  FALSE DHFR_ic75_palmer
## 2   xx0xxx 3.1289862 p1|p21|p28|p30|p94         5  FALSE DHFR_ic75_palmer
## 3   xx000x 1.0635938         p1|p21|p94         3  FALSE DHFR_ic75_palmer
## 4   x0xxx0 0.2566377     p1|p26|p28|p30         4  FALSE DHFR_ic75_palmer
## 5   x0xxxx 0.5135761 p1|p26|p28|p30|p94         5  FALSE DHFR_ic75_palmer
## 6   x0x0xx 0.2071172     p1|p26|p30|p94         4  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond       means traj_effects access
## 1        DHFR         ic75 palmer -1.01903501     2.313867      5
## 2        DHFR         ic75 palmer -0.46558155     2.278754      4
## 3        DHFR         ic75 palmer -0.37347938     1.983175      3
## 4        DHFR         ic75 palmer -1.17749221     2.322219      3
## 5        DHFR         ic75 palmer -3.08099171     2.324282      4
## 6        DHFR         ic75 palmer -0.04561225     2.217484      4
##   access_no_idio access_mean access_w_mean access_w_idio
## 1              0           0            -5             5
## 2              2           2            -2             2
## 3              1           0            -3             2
## 4              0           0            -3             3
## 5              1           0            -4             3
## 6              1           1            -3             3

Even more interesting are the nodes with are either ENTIRELY accessible or inaccessible due to epistasis

head(adaptive_traj_access %>%
  drop_na() %>%
  filter(avg > log10(1.5) & access_no_idio == 0 & access > 0))
##   genotype        avg                pos mutations likely        unique_id
## 1   xxxx0x 10.4358638 p1|p21|p26|p28|p94         5  FALSE DHFR_ic75_palmer
## 2   xxx0xx  2.5755328 p1|p21|p26|p30|p94         5  FALSE DHFR_ic75_palmer
## 3   x0xxx0  0.2566377     p1|p26|p28|p30         4  FALSE DHFR_ic75_palmer
## 4   x00x0x  1.0706822         p1|p28|p94         3  FALSE DHFR_ic75_palmer
## 5   0xx00x  5.0316956        p21|p26|p94         3  FALSE DHFR_ic75_palmer
## 6    x0xxx  1.8737920    p21|p28|p30|p94         4  FALSE    DHFR_ki_trajg
##   enzyme_name measure_type   cond     means traj_effects access access_no_idio
## 1        DHFR         ic75 palmer  6.841296     2.294466      3              0
## 2        DHFR         ic75 palmer -1.019035     2.313867      5              0
## 3        DHFR         ic75 palmer -1.177492     2.322219      3              0
## 4        DHFR         ic75 palmer  1.647842     2.283301      3              0
## 5        DHFR         ic75 palmer  1.695299     1.363612      3              0
## 6        DHFR           ki  trajg  1.672591     4.198657      4              0
##   access_mean access_w_mean access_w_idio
## 1           1            -2             3
## 2           0            -5             5
## 3           0            -3             3
## 4           3             0             3
## 5           1            -2             3
## 6           4             0             4
head(adaptive_traj_access %>%
  drop_na() %>%
  filter(avg < log10(1/1.5) & access_no_idio > 0 & access == 0))
##   genotype       avg             pos mutations likely        unique_id
## 1   xx0x0x -5.214086  p1|p21|p28|p94         4  FALSE DHFR_ic75_palmer
## 2   0xx000 -4.067367         p21|p26         2  FALSE DHFR_ic75_palmer
## 3   0xxx0x -4.725753 p21|p26|p28|p94         4  FALSE DHFR_ic75_palmer
## 4   0x0xx0 -1.671420     p21|p28|p30         3  FALSE DHFR_ic75_palmer
## 5   0x00xx -1.569622     p21|p30|p94         3  FALSE DHFR_ic75_palmer
## 6   000xxx -2.043647     p28|p30|p94         3  FALSE DHFR_ic75_palmer
##   enzyme_name measure_type   cond      means traj_effects access access_no_idio
## 1        DHFR         ic75 palmer -0.2289452   -3.0315171      0              3
## 2        DHFR         ic75 palmer  0.5351546   -3.0315171      0              2
## 3        DHFR         ic75 palmer -0.8278136   -0.4225082      0              4
## 4        DHFR         ic75 palmer  0.7602580   -0.8041003      0              1
## 5        DHFR         ic75 palmer  0.2794516   -0.5436340      0              2
## 6        DHFR         ic75 palmer  0.6800592   -0.1573908      0              2
##   access_mean access_w_mean access_w_idio
## 1           1             1            -3
## 2           2             2            -2
## 3           4             4            -4
## 4           2             2            -1
## 5           2             2            -2
## 6           3             3            -2

Specific epistasis examples, starting with PTE

with_epi = fit_land %>%
  filter(unique_id == "PTE_catact_2NH" & id == "111010") %>%
  pull(effect)

epi = higher_df %>% filter(partial_id == "PTE_catact_2NH" & genotype == "xxx0x0") %>% pull(avg)
mean_epi = adaptive_traj %>% filter(unique_id == "PTE_catact_2NH" & genotype == "xxx0x0") %>% pull(means)

wo_epi = with_epi - epi
w_mean_epi = with_epi - epi + mean_epi

fig_4_new = fit_land %>%
  filter(unique_id == "PTE_catact_2NH") %>%
  filter(id == "000010" | id == "010010" | id == "110010" | id == "111010" | id == "111110" | id == "111111") %>%
  add_row(id = "000000", 
          effect = 0, 
          muts = 0, 
          junk1 = NA, 
          junk2 = NA, 
          enz = "PTE", 
          unique_id = "PTE_catact_2NH",
          .before = 1) %>%
  add_row(id = "alt", 
          effect = wo_epi, 
          muts = 4, 
          junk1 = NA, 
          junk2 = NA, 
          enz = "PTE", 
          unique_id = "PTE_catact_2NH",
          .before = 1) %>%
  add_row(id = "alt2", 
          effect = w_mean_epi, 
          muts = 4, 
          junk1 = NA, 
          junk2 = NA, 
          enz = "PTE", 
          unique_id = "PTE_catact_2NH",
          .before = 1) %>%
  select(-c("junk1", "junk2")) %>%
  mutate(muts = factor(muts)) %>%
  ggplot(aes(x = muts, y = effect)) +
  geom_point(shape = 1, size = 3) +
  labs(x = "Step",
       y = expression("log"[10]*" Function") ) +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), 
        axis.ticks = element_line(size = 0.2, color = "black"), 
        text = element_text(size = 9), 
        axis.text = element_text(size = 8, color = "black"))

fig_4_new

#ggsave("fig_4A_new.svg", fig_4_new, width = 180/2.5, height = 247/4, dpi = 300, units = "mm")

Next one is MPH

with_epi = fit_land %>%
  filter(unique_id == "MPH_catact_ZnPTM" & id == "10001") %>%
  pull(effect)

epi = higher_df %>% filter(partial_id == "MPH_catact_ZnPTM" & genotype == "x000x") %>% pull(avg)
mean_epi = adaptive_traj %>% filter(unique_id == "MPH_catact_ZnPTM" & genotype == "x000x") %>% pull(means)

wo_epi = with_epi - epi
w_mean_epi = with_epi - epi + mean_epi

fig_4b_new = fit_land %>%
  filter(unique_id == "MPH_catact_ZnPTM") %>%
  filter(id == "10000" | id == "10001" | id == "11001" | id == "11101" | id == "11111") %>%
  add_row(id = "00000", 
          effect = 0, 
          muts = 0, 
          junk1 = NA, 
          junk2 = NA, 
          enz = "MPH", 
          unique_id = "MPH_catact_ZnPTM",
          .before = 1) %>%
  add_row(id = "alt", 
          effect = wo_epi, 
          muts = 2, 
          junk1 = NA, 
          junk2 = NA, 
          enz = "MPH", 
          unique_id = "MPH_catact_ZnPTM") %>%
  add_row(id = "alt2", 
          effect = w_mean_epi, 
          muts = 2, 
          junk1 = NA, 
          junk2 = NA, 
          enz = "MPH", 
          unique_id = "MPH_catact_ZnPTM") %>%
  select(-c("junk1", "junk2")) %>%
  mutate(muts = factor(muts)) %>%
  ggplot(aes(x = muts, y = effect)) +
  geom_point(shape = 1, size = 3) +
  labs(x = "Step",
       y = expression("log"[10]*" Function") ) +
  theme_classic() +
  theme(axis.line = element_line(size = 0.2, color = "black"), 
        axis.ticks = element_line(size = 0.2, color = "black"), 
        text = element_text(size = 9), 
        axis.text = element_text(size = 8, color = "black"))

fig_4b_new

#ggsave("fig_4B_new.svg", fig_4b_new, width = 180/2.5, height = 247/4, dpi = 300, units = "mm")

Correlation between F and epistasis

Here we attempt to see whether the function of the genotype F is an indicator of the magnitude of epistasis

higher_df_start_to_epi = higher_df %>%
  unite("partial_id", c(Enzyme, Measurement, Condition)) %>%
  mutate(from = str_replace_all(genotype, "x", "0"),
         to = str_replace_all(genotype, "x", "1"))

start_effect = c()
for(i in 1:dim(higher_df_start_to_epi)[1]) {
  
  if(!str_detect(higher_df_start_to_epi$from[i], "1")) {
    start_effect = c(start_effect, 0) # If starts from WT, start effect is 0
  } else {
    start_effect = c(start_effect, filter(fit_land, unique_id == higher_df_start_to_epi$partial_id[i] & id == higher_df_start_to_epi$from[i]) 
  %>% pull(effect))
  }
}

higher_df_start_to_epi$start_effect = start_effect

The higher_df_start_to_epi tibble contains information regarding specific epistasis in the avg variable and starting effect of the genotype in the start_effect. We can plot the correlation for all landscapes

higher_df_start_to_epi %>%
  filter(mutations > 1) %>%
  ggplot(aes(x = start_effect, y = avg)) +
  geom_point() +
  theme_classic()

Then again for adaptive landscapes only

higher_df_start_to_epi %>%
  filter(mutations > 1 & partial_id %in% adaptive_trajectories) %>%
  ggplot(aes(x = start_effect, y = avg)) +
  geom_point() +
  theme_classic()

What if we facet adaptive landscapes by order?

higher_df_start_to_epi %>%
  filter(mutations > 1 & partial_id %in% adaptive_trajectories) %>%
  ggplot(aes(x = start_effect, y = avg)) +
  geom_point() +
  facet_wrap(~ mutations) +
  theme_classic()

Due to the differences in strength of functional affects across landscapes, maybe its important to separate by landscape and see whether there is a difference between the correlation of mutation effects and starting function VS the correlation of epistasis and starting function

trajectory_names = c(
  DHFR_ic75_palmer = "DHFR Palmer et al.",
  DHFR_ki_trajg = "DHFR Traj. G",
  DHFR_ki_trajr = "DHFR Traj. R",
  MPH_catact_ZnPTM = "MPH Traj. Zn",
  NfsA_ec50_2039 = "NfsA Traj. 20_39",
  NfsA_ec50_3637 = "NfsA Traj. 36_37",
  `OXA-48_ic50_CAZtraj1` = "OXA-48 CAZ Traj. 1",
  `OXA-48_ic50_CAZtraj2` = "OXA-48 CAZ Traj. 2",
  `OXA-48_ic50_CAZtraj3` = "OXA-48 CAZ Traj. 3",
  PTE_catact_2NH = "PTE Traj. 2NH",
  TEM_MIC_weinreich = "TEM Weinreich et al.")

global_labeller = labeller(
  partial_id = trajectory_names
)

dimish_del_f = higher_df_start_to_epi %>%
  filter(mutations == 1 & partial_id %in% adaptive_trajectories) %>%
  ggplot(aes(x = start_effect, y = avg)) +
  stat_cor(aes(label = paste(..r.label.., sep = "~~")), method = "spearman", cor.coef.name = "rho", label.x.npc = "middle",
  label.y.npc = "top", r.accuracy = 0.01, size = 2.5, color = "black") +
  geom_point() +
  geom_smooth(method = 'loess', formula = y ~ x) +
  facet_wrap(~ partial_id,
             scales = "free",
             labeller = global_labeller) +
  theme_classic() +
  labs(x = expression("Starting Function ("*italic("F")*")"), 
       y = expression(Delta*italic("F"))) +
  theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
        legend.position = "none", axis.line=element_line()) #+ 
 # scale_y_continuous(limits=c(-5,5)) +
  #scale_x_continuous(limits=c(-3,5))

dimish_epi = higher_df_start_to_epi %>%
  filter(mutations > 1 & partial_id %in% adaptive_trajectories) %>%
  ggplot(aes(x = start_effect, y = avg)) +
  stat_cor(aes(label = paste(..r.label.., sep = "~~")), method = "spearman", cor.coef.name = "rho", label.x.npc = "middle",
  label.y.npc = "top", r.accuracy = 0.01, size = 2.5, color = "black") +
  geom_point() +
  geom_smooth(method = 'loess', formula = y ~ x) +
  facet_wrap(~ partial_id,
             scales = "free",
             labeller = global_labeller) +
  theme_classic() +
  labs(x = expression("Starting Function ("*italic("F")*")"), 
       y = expression(epsilon)) +
  theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
        legend.position = "none", axis.line=element_line())

dimish_del_f

dimish_epi

#ggsave("supp_fig_3A.svg", dimish_del_f, width = 180, height = 247/2, dpi = 300, units = "mm")
#ggsave("supp_fig_3B.svg", dimish_epi, width = 180, height = 247/2, dpi = 300, units = "mm")